GitHub Repository: debakarr/machinelearning
Path: blob/master/Part 10 - Model Selection And Boosting/Grid Search/[Python] Grid Search.ipynb
¹³³⁶ views

Kernel: Python 3

Grid Search

Data preprocessing

In [1]:

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split # for training and testing split
from sklearn.preprocessing import StandardScaler # for Feature scaling
from sklearn.svm import SVC # for classifier
from sklearn.metrics import confusion_matrix # for making confusion matrix
from matplotlib.colors import ListedColormap # for visualisation
%matplotlib inline
plt.rc('figure', figsize = (14, 8)) # default (width, height) for every figure drawn below

In [2]:

# Read the ads dataset and separate predictors from the label
dataset = pd.read_csv('Social_Network_Ads.csv')
X = dataset.iloc[:, 2:4].values # columns 2 and 3: the two features used by the model
y = dataset.iloc[:, 4].values # column 4: the class label to predict

In [3]:

dataset.head(n = 10) # preview the first ten rows of the raw table

Out[3]:

In [4]:

# Hold out 20% of the rows as the test set; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size = 0.2,
    random_state = 42,
)

In [5]:

X_train[:10] # first ten training rows, before feature scaling

Out[5]:

array([[  2.70000000e+01,   5.70000000e+04],
       [  4.60000000e+01,   2.80000000e+04],
       [  3.90000000e+01,   1.34000000e+05],
       [  4.40000000e+01,   3.90000000e+04],
       [  5.70000000e+01,   2.60000000e+04],
       [  3.20000000e+01,   1.20000000e+05],
       [  4.10000000e+01,   5.20000000e+04],
       [  4.80000000e+01,   7.40000000e+04],
       [  2.60000000e+01,   8.60000000e+04],
       [  2.20000000e+01,   8.10000000e+04]])

In [6]:

X_test[:10] # first ten test rows, before feature scaling

Out[6]:

array([[  4.60000000e+01,   2.20000000e+04],
       [  5.90000000e+01,   8.80000000e+04],
       [  2.80000000e+01,   4.40000000e+04],
       [  4.80000000e+01,   9.60000000e+04],
       [  2.90000000e+01,   2.80000000e+04],
       [  3.00000000e+01,   6.20000000e+04],
       [  4.70000000e+01,   1.07000000e+05],
       [  2.90000000e+01,   8.30000000e+04],
       [  4.00000000e+01,   7.50000000e+04],
       [  4.20000000e+01,   6.50000000e+04]])

In [7]:

y_train[:10] # labels of the first ten training rows

Out[7]:

array([0, 1, 1, 0, 1, 1, 0, 1, 0, 0])

In [8]:

y_test[:10] # labels of the first ten test rows

Out[8]:

array([0, 1, 0, 1, 0, 0, 1, 0, 0, 0])

In [9]:

# Standardise both features using statistics learned from the training rows only
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test) # the test rows reuse the training-set statistics

In [10]:

X_train[:10] # the same ten training rows, now standardised

Out[10]:

array([[-1.06675246, -0.38634438],
       [ 0.79753468, -1.22993871],
       [ 0.11069205,  1.853544  ],
       [ 0.60129393, -0.90995465],
       [ 1.87685881, -1.28811763],
       [-0.57615058,  1.44629156],
       [ 0.3069328 , -0.53179168],
       [ 0.99377543,  0.10817643],
       [-1.16487283,  0.45724994],
       [-1.55735433,  0.31180264]])

In [11]:

X_test[:10] # the same ten test rows, now standardised

Out[11]:

array([[ 0.79753468, -1.40447546],
       [ 2.07309956,  0.51542886],
       [-0.96863208, -0.76450736],
       [ 0.99377543,  0.74814454],
       [-0.87051171, -1.22993871],
       [-0.77239133, -0.24089709],
       [ 0.89565505,  1.06812859],
       [-0.87051171,  0.36998156],
       [ 0.20881242,  0.13726589],
       [ 0.40505317, -0.15362871]])

Fitting Kernel SVM classifier to the Training set

In [12]:

# Kernel SVM (RBF kernel) is a non-linear classifier; the seed is fixed for repeatability
classifier = SVC(
    random_state = 42,
    kernel = 'rbf',
)
classifier.fit(X_train, y_train)

Out[12]:

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=42, shrinking=True,
  tol=0.001, verbose=False)

Predicting the Test set results

In [13]:

# Predict the class of every (scaled) test row with the classifier fitted above
y_pred = classifier.predict(X_test)

In [14]:

y_pred[:10] # first ten predicted labels

Out[14]:

array([1, 1, 0, 1, 0, 0, 1, 0, 0, 0])

In [15]:

y_test[:10] # first ten true labels, for comparison with the predictions above

Out[15]:

array([0, 1, 0, 1, 0, 0, 1, 0, 0, 0])

Applying k-Fold Cross Validation

In [16]:

from sklearn.model_selection import cross_val_score

In [17]:

# Print the docstring of cross_val_score as a reference for the call below.
# NOTE(review): interactive aid with a very long output; consider removing it from the final notebook.
help(cross_val_score)

Out[17]:

Help on function cross_val_score in module sklearn.model_selection._validation:

cross_val_score(estimator, X, y=None, groups=None, scoring=None, cv=None, n_jobs=1, verbose=0, fit_params=None, pre_dispatch='2*n_jobs')
    Evaluate a score by cross-validation
    
    Read more in the :ref:`User Guide <cross_validation>`.
    
    Parameters
    ----------
    estimator : estimator object implementing 'fit'
        The object to use to fit the data.
    
    X : array-like
        The data to fit. Can be for example a list, or an array.
    
    y : array-like, optional, default: None
        The target variable to try to predict in the case of
        supervised learning.
    
    groups : array-like, with shape (n_samples,), optional
        Group labels for the samples used while splitting the dataset into
        train/test set.
    
    scoring : string, callable or None, optional, default: None
        A string (see model evaluation documentation) or
        a scorer callable object / function with signature
        ``scorer(estimator, X, y)``.
    
    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
    
        - None, to use the default 3-fold cross validation,
        - integer, to specify the number of folds in a `(Stratified)KFold`,
        - An object to be used as a cross-validation generator.
        - An iterable yielding train, test splits.
    
        For integer/None inputs, if the estimator is a classifier and ``y`` is
        either binary or multiclass, :class:`StratifiedKFold` is used. In all
        other cases, :class:`KFold` is used.
    
        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validation strategies that can be used here.
    
    n_jobs : integer, optional
        The number of CPUs to use to do the computation. -1 means
        'all CPUs'.
    
    verbose : integer, optional
        The verbosity level.
    
    fit_params : dict, optional
        Parameters to pass to the fit method of the estimator.
    
    pre_dispatch : int, or string, optional
        Controls the number of jobs that get dispatched during parallel
        execution. Reducing this number can be useful to avoid an
        explosion of memory consumption when more jobs get dispatched
        than CPUs can process. This parameter can be:
    
            - None, in which case all the jobs are immediately
              created and spawned. Use this for lightweight and
              fast-running jobs, to avoid delays due to on-demand
              spawning of the jobs
    
            - An int, giving the exact number of total jobs that are
              spawned
    
            - A string, giving an expression as a function of n_jobs,
              as in '2*n_jobs'
    
    Returns
    -------
    scores : array of float, shape=(len(list(cv)),)
        Array of scores of the estimator for each run of the cross validation.
    
    Examples
    --------
    >>> from sklearn import datasets, linear_model
    >>> from sklearn.model_selection import cross_val_score
    >>> diabetes = datasets.load_diabetes()
    >>> X = diabetes.data[:150]
    >>> y = diabetes.target[:150]
    >>> lasso = linear_model.Lasso()
    >>> print(cross_val_score(lasso, X, y))  # doctest: +ELLIPSIS
    [ 0.33150734  0.08022311  0.03531764]
    
    See Also
    ---------
    :func:`sklearn.model_selection.cross_validate`:
        To run cross-validation on multiple metrics and also to return
        train scores, fit times and score times.
    
    :func:`sklearn.metrics.make_scorer`:
        Make a scorer from a performance metric or loss function.

In [18]:

# 10-fold cross-validation of the classifier on the training set: one score per fold
accuracies = cross_val_score(classifier, X_train, y_train, cv = 10)

In [19]:

accuracies # one held-out accuracy per cross-validation fold (10 folds)

Out[19]:

array([ 0.96969697,  0.87878788,  0.93939394,  0.96969697,  0.90909091,
        0.87096774,  0.87096774,  0.87096774,  0.83870968,  0.93548387])

In [20]:

accuracies.mean() # average accuracy across the folds

Out[20]:

0.90537634408602141

In [21]:

accuracies.std() # standard deviation of the fold accuracies

Out[21]:

0.043652022122493701

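A mean accuracy of about 90.5% with a standard deviation of about 4.4% across the 10 folds suggests that the model is in the low-bias, low-variance category of the bias–variance trade-off.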

Applying Grid Search to find the best model and the best parameters

In [22]:

from sklearn.model_selection import GridSearchCV

In [23]:

# Two sub-grids: a linear kernel (only C is tuned) and an RBF kernel (C and gamma are tuned)
c_values = [1, 10, 100, 1000]
parameters = [
    {'C': c_values, 'kernel': ['linear']},
    {'C': c_values, 'kernel': ['rbf'], 'gamma': [0.5, 0.1, 0.01, 0.001, 0.0001]},
]

# Score every combination by 10-fold cross-validated accuracy, using all available CPUs
grid_search = GridSearchCV(
    estimator = classifier,
    param_grid = parameters,
    scoring = 'accuracy',
    cv = 10,
    n_jobs = -1,
).fit(X_train, y_train)

In [24]:

# Best mean cross-validated score found by the search (scoring was set to 'accuracy')
best_accuracy = grid_search.best_score_
best_accuracy

Out[24]:

0.90625

In [25]:

# Parameter combination that produced the best score above
best_parameters = grid_search.best_params_
best_parameters

Out[25]:

{'C': 1, 'gamma': 0.5, 'kernel': 'rbf'}

Making the Confusion Matrix

In [26]:

# Evaluate the model selected by the grid search rather than the classifier fitted
# before tuning. GridSearchCV refits the best parameter combination on the whole
# training set by default and exposes it as best_estimator_.
# confusion_matrix is already imported in the first cell, so it is not re-imported here.
# NOTE(review): the search selected C=1, gamma=0.5, kernel='rbf', which presumably
# coincides with the defaults used earlier (gamma='auto' with two features), so the
# matrix is expected to be unchanged -- confirm on re-run.
y_pred_tuned = grid_search.best_estimator_.predict(X_test)
cm = confusion_matrix(y_test, y_pred_tuned) # rows: true class, columns: predicted class
cm

Out[26]:

array([[47,  5],
       [ 1, 27]])

The classifier made 47 + 27 = 74 correct predictions and 5 + 1 = 6 incorrect predictions.

Visualising the Training set results

In [27]:

# Decision regions of the fitted classifier over a dense grid, with the training points on top.
# NOTE: X_train was standardised above, so both axes are in scaled units, not raw values.
X_set, y_set = X_train, y_train
class_colours = ListedColormap(('red', 'green')) # one colour per class, shared by regions and points
# Grid covering the data range plus a margin of 1 on every side, sampled every 0.01
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
                     np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
grid_points = np.array([X1.ravel(), X2.ravel()]).T # every grid node as one (feature 1, feature 2) row
plt.contourf(X1, X2, classifier.predict(grid_points).reshape(X1.shape),
             alpha = 0.75, cmap = class_colours)
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    # Use `color`, not `c`: a bare RGBA tuple passed as `c` is ambiguous and would be
    # treated as values to colour-map whenever a class happens to contain exactly 4 points.
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                color = class_colours(i), label = j, edgecolors = 'white', linewidth = 0.7)
plt.title('Kernel SVM (Training set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()

Out[27]:

Visualising the Test set results

In [28]:

# Decision regions of the fitted classifier over a dense grid, with the test points on top.
# NOTE: X_test was standardised above, so both axes are in scaled units, not raw values.
X_set, y_set = X_test, y_test
class_colours = ListedColormap(('red', 'green')) # one colour per class, shared by regions and points
# Grid covering the data range plus a margin of 1 on every side, sampled every 0.01
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
                     np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
grid_points = np.array([X1.ravel(), X2.ravel()]).T # every grid node as one (feature 1, feature 2) row
plt.contourf(X1, X2, classifier.predict(grid_points).reshape(X1.shape),
             alpha = 0.75, cmap = class_colours)
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    # Use `color`, not `c`: a bare RGBA tuple passed as `c` is ambiguous and would be
    # treated as values to colour-map whenever a class happens to contain exactly 4 points.
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                color = class_colours(i), label = j, edgecolors = 'white', linewidth = 0.7)
plt.title('Kernel SVM (Test set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()

Out[28]:

Looks like it is much better than the Linear kernel.